import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# --- Load the raw datasets; the first CSV column is used as the index ---
weather_actual = pd.read_csv('weather_actuals.csv',index_col=0)
weather_forecast = pd.read_csv('weather_forecast.csv',index_col=0)
power_data = pd.read_csv('power_actual.csv',index_col=0)
# Quick inspection (in a notebook only the last expression of a cell displays)
weather_actual.head()
power_data.head()
weather_actual.shape
power_data.shape
# Column means: several columns have significant missing data, and some use
# -9999 as a sentinel for bad readings.
weather_actual.describe().loc['mean']
weather_forecast.info()
# Drop every column that contains nulls (these columns are missing a
# significant share of their data).
null_columns = weather_actual.columns[weather_actual.isnull().any()]
weather_actual[null_columns].isnull().sum()
weather_actual = weather_actual.drop(null_columns, axis=1)
# Drop the same columns from the forecast data so both frames stay aligned.
weather_forecast = weather_forecast.drop(null_columns, axis=1)
# The -9999 sentinel marks bad readings: convert it to NaN.
# (np.NaN was removed in NumPy 2.0 — the lowercase np.nan is the correct name.)
weather_actual.replace(to_replace=-9999, value=np.nan, inplace=True)
# Columns that contained the sentinel, to be repaired by interpolation below.
incorrect_value_columns = weather_actual.columns[weather_actual.isna().any()].tolist()
# Plot the distribution of every column that had sentinel values, in a 5x2
# grid of subplots. enumerate replaces the original hand-maintained counter.
for subplot_idx, col_name in enumerate(incorrect_value_columns, start=1):
    plt.subplot(5, 2, subplot_idx)
    weather_actual[col_name].hist(bins=50, figsize=(10, 14))
    plt.xlabel(col_name)
    plt.tight_layout()
# Repair the sentinel (-9999 -> NaN) values by linear interpolation; among the
# methods tried, this one did not change the columns' distributions.
weather_actual[incorrect_value_columns] = weather_actual[incorrect_value_columns].interpolate(method = 'linear', limit_direction = 'backward')
# Confirm no nulls remain and re-inspect the cleaned frame
weather_actual.isnull().sum()
weather_actual.shape
weather_actual.columns
# Box plots to check each weather feature for outliers.
col = ['cloud_cover',
       'apparent_temperature', 'temperature', 'humidity', 'dew_point',
       'wind_bearing', 'wind_speed', 'wind_gust', 'pressure', 'uv_index',
       'ozone', 'precip_intensity', 'precip_probability', 'visibility']
# (the original kept an unused counter variable here; it has been removed)
for feature in col:
    plt.figure(figsize=(3,2))
    sns.boxplot(weather_actual[feature])
    plt.xlabel(feature)
    plt.tight_layout()
# From the box plots: dew_point, wind_speed, wind_gust, uv_index,
# precip_intensity and precip_probability have outliers.
# precip_intensity / precip_probability are ~99.3% zeros, so their IQR is 0 and
# capping is meaningless; those values are either removed after merging with
# power_data or kept as-is.
col_with_outliers = ['dew_point', 'wind_speed', 'wind_gust', 'uv_index']
Q1 = weather_actual[col_with_outliers].quantile(0.25)
Q3 = weather_actual[col_with_outliers].quantile(0.75)
IQR = Q3 - Q1
# Only dew_point has low outliers: floor it at Q1 - 1.5*IQR.
# NOTE: assignment uses a single .loc[mask, col] indexing operation; the
# original chained form df[col].loc[mask] = value assigns through an
# intermediate object and can silently fail (SettingWithCopyWarning).
dew_point_floor = Q1.loc['dew_point'] - 1.5 * IQR.loc['dew_point']
weather_actual.loc[weather_actual['dew_point'] <= dew_point_floor, 'dew_point'] = dew_point_floor
# Cap the high outliers of every listed feature at Q3 + 1.5*IQR.
for feature in col_with_outliers:
    upper_cap = Q3.loc[feature] + 1.5 * IQR.loc[feature]
    weather_actual.loc[weather_actual[feature] >= upper_cap, feature] = upper_cap
power_data.isnull().sum()
# ghi and gti have missing values, so drop both columns
power_data.drop(['ghi','gti'],axis =1 ,inplace=True)
power_data.describe()
# Weather data is hourly while power readings arrive every 15 minutes,
# so aggregate power to hourly totals.
power_data['datetime'] = pd.to_datetime(power_data['datetime'])
power_data['hour'] = power_data['datetime'].dt.hour
#power_data['min'] = power_data['datetime'].dt.minute
power_data['date'] = power_data['datetime'].dt.date
# sort=False keeps the groups in their original chronological order
power_data = power_data.groupby(['date','hour'],sort=False).power.sum().to_frame().reset_index()
# power_data has 17520 rows (24 h x 730 days) while weather_actual has 13619,
# i.e. 17520 - 13619 = 3901 hourly rows are missing from the weather data.
power_data.shape
# --- Align the dates present in the power and weather datasets ---
# Collect the dates that appear in power_data but not in weather_actual
weather_actual['datetime_local'] = pd.to_datetime(weather_actual['datetime_local'])
weather_actual['date'] = weather_actual['datetime_local'].dt.date
missing_dates = power_data.date[~power_data.date.isin(weather_actual.date)]
missing_dates = missing_dates.unique()
# Dates whose hourly weather rows do not total 24: the whole day is incomplete
# and will be removed from both datasets.
Missing_hour_data_date = (weather_actual['date'].value_counts() != 24)[(weather_actual['date'].value_counts() != 24)==True].index
# Removing the incomplete dates from the weather actual data
Missing_hour_data_date = pd.Series(Missing_hour_data_date)
# NOTE(review): only index [0] is removed here, i.e. this assumes there is
# exactly one incomplete date — confirm, otherwise the rest remain.
weather_actual = weather_actual[weather_actual['date'] != Missing_hour_data_date[0]].reset_index(drop =True)
weather_actual.shape
# Append the incomplete dates to missing_dates; all of them are then removed
# from the power data so both frames cover the same days.
missing_dates = np.append(missing_dates, np.array(Missing_hour_data_date))
# Remove the missing dates from the power data
power_data = power_data[~power_data['date'].isin(missing_dates)].reset_index(drop=True)
weather_actual.shape
# Checking the shapes and whether the dates in both datasets now agree
print('Shape of weather_data', weather_actual.shape)
print('Shape of power_data', power_data.shape)
print(power_data.date[~power_data.date.isin(weather_actual.date)])
EDA on power data
# Box plot of the power distribution to check for outliers
sns.boxplot(power_data['power'])
# Power vs hour-of-day: shows at what time the outliers occur
sns.scatterplot(x='hour',y='power',data=power_data)
From the box plot and scatter plot we can say that there are extremely high outliers present in the power data.
From the scatter plot we can see that these high power values (outliers) occur during the daytime (i.e. between 6 am and 4 pm), and we could cap them at Q3 + 1.5 IQR.
Instead, we will drop these dates from both datasets so the model does not learn from incorrect values.
# The outliers are extremely high and very few, so drop the dates whose power
# values exceed the Tukey upper fence Q3 + 1.5*IQR.
Q1 = power_data['power'].quantile(0.25)
Q3 = power_data['power'].quantile(0.75)
IQR = Q3 - Q1
upper_limit = Q3 + 1.5 * IQR
print('Upper_limit is', upper_limit)
# Dates containing at least one out-of-range power value
incorrect_power_date = power_data[power_data['power'] > upper_limit]['date'].unique()
# Remove those dates from both datasets.
# drop=True so the old index is not kept as a stray 'index' column
# (the original omitted it here while using it on the next line).
power_data = power_data[~power_data['date'].isin(incorrect_power_date)].reset_index(drop=True)
weather_actual = weather_actual[~weather_actual['date'].isin(incorrect_power_date)].reset_index(drop=True)
# Checking the shapes and whether the dates in both datasets still agree
print('Shape of weather_data', weather_actual.shape)
print('Shape of power_data', power_data.shape)
#print(power_data.date[~power_data.date.isin(weather_actual.date)])
# Attach the hourly power to the weather rows by aligning on row position
# (both frames were filtered to the same dates and reset above).
# NOTE(review): this relies on identical row order in both frames — a merge on
# (date, hour) would be safer; verify the shapes printed above match.
weather_actual = weather_actual.merge(power_data['power'], right_index=True,left_index=True)
# Checking for null values after the merge
weather_actual.isnull().sum()
# Correlation between features and the target: it is nearly zero for all
# features, so linear models are unlikely to do well here.
plt.subplots(figsize=(15,15))
sns.heatmap(weather_actual.corr(), annot=True)
plt.tight_layout()
sns.pairplot(weather_actual)
Feature Engineering
Creating a function that removes irrelevant features and adds some new ones.
weather_actual.columns
# Removing the date column so weather_actual and weather_forecast share the same features
weather_actual = weather_actual.drop('date',axis=1)
def feature_eng(weather_actual):
    """Prepare a weather dataframe for modelling.

    Drops identifier/redundant columns, replaces the three timestamp columns
    with an ``hour`` feature and a ``sun_duration`` feature (minutes of
    daylight), and label-encodes the ``icon`` category.

    Works on a copy so the caller's dataframe is left untouched (the original
    mutated its argument in place; both call sites reassign the return value,
    so this is a safe fix).

    Parameters: weather_actual - dataframe with the raw weather columns.
    Returns: the engineered dataframe.
    """
    weather_actual = weather_actual.copy()
    # Identifier columns and features redundant with others.
    weather_actual.drop(['plant_id', 'datetime_utc', 'updated_at',
                         'apparent_temperature', 'summary'], axis=1, inplace=True)
    # Timestamp columns to parse into datetimes.
    time_col_lst = ['datetime_local', 'sunrise', 'sunset']
    for col in time_col_lst:
        weather_actual[col] = pd.to_datetime(weather_actual[col])
    # Only the hour of day matters for power generation, not the full timestamp.
    weather_actual['hour'] = weather_actual['datetime_local'].dt.hour
    # One daylight-duration feature (in minutes) replaces sunrise and sunset.
    weather_actual['sun_duration'] = (weather_actual['sunset'] - weather_actual['sunrise']) / np.timedelta64(1, 'm')
    weather_actual.drop(time_col_lst, axis=1, inplace=True)
    # Label-encode icon. Codes follow the sorted order of the categories, so
    # NOTE(review): they only match between actual and forecast data if both
    # contain the same set of icon values — verify before predicting.
    weather_actual['icon'] = weather_actual['icon'].astype('category')
    weather_actual['icon'] = weather_actual['icon'].cat.codes
    return weather_actual
#Applying feature engineering on the weather actual data
weather_actual =feature_eng(weather_actual)
#Checking the columns
weather_actual.columns
Applying a few ML models and checking their performance on test data. Based on the results we will select the best-performing model for predicting on the weather forecast data.
Splitting the data
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Target is the hourly power; everything else is a feature.
y = weather_actual['power']
X = weather_actual.drop(['power'],axis=1)
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
Linear Regression
# Importing and fitting a Linear Regression model.
from sklearn.linear_model import LinearRegression
# normalize= was deprecated in scikit-learn 0.24 and removed in 1.2; for plain
# OLS it only rescaled the features internally (predictions are unchanged),
# so it is simply dropped here.
lm = LinearRegression()
lm.fit(X_train, y_train)
# Coefficient learned for each feature, plus the intercept.
coeff_df = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
print(coeff_df)
print('lm intercept', lm.intercept_)
# Predict on the held-out test set.
y_pred_lr = lm.predict(X_test)
# Scatter of actual vs predicted power to eyeball the fit.
plt.scatter(y_test, y_pred_lr)
# Evaluate with MAE / MSE / RMSE.
MAE_Regression = metrics.mean_absolute_error(y_test, y_pred_lr)
MSE_Regression = metrics.mean_squared_error(y_test, y_pred_lr)
RMSE_Regression = np.sqrt(MSE_Regression)
print('MAE:', MAE_Regression)
print('MSE:', MSE_Regression)
print('RMSE:', RMSE_Regression)
LASSO
# Importing and fitting a Lasso regression model.
# NOTE(review): default alpha=1.0 on unscaled features — consider scaling and
# tuning alpha; as written this is a baseline only.
from sklearn.linear_model import Lasso
model_lasso = Lasso()
model_lasso.fit(X_train,y_train)
# Predict on the held-out test set
y_pred_lasso = model_lasso.predict(X_test)
# Scatter of actual vs predicted power to eyeball the fit
plt.scatter(y_test,y_pred_lasso)
# Evaluate with MAE / MSE / RMSE
MAE_lasso = metrics.mean_absolute_error(y_test, y_pred_lasso)
MSE_lasso = metrics.mean_squared_error(y_test, y_pred_lasso)
RMSE_lasso = np.sqrt(metrics.mean_squared_error(y_test, y_pred_lasso))
print('MAE:', MAE_lasso)
print('MSE:', MSE_lasso)
print('RMSE:', RMSE_lasso)
Linear Regression Using Polynomial Features
# Linear regression on polynomial features; several degrees were tried and
# degree 3 gave the best result.
from sklearn.preprocessing import PolynomialFeatures

def polynomial_regression_model(degree):
    """Fit LinearRegression on degree-`degree` polynomial features of the
    global X_train/y_train and return predictions for the global X_test.

    Parameters: degree - polynomial degree for the feature expansion.
    Returns: array of predictions for X_test.
    """
    poly_features = PolynomialFeatures(degree=degree)
    # Expand the training features to the higher-degree feature space.
    X_train_poly = poly_features.fit_transform(X_train)
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)
    # Use transform (not fit_transform) on the test set: the feature mapping
    # must be the one fitted on the training data.
    # (The unused training-set prediction from the original was removed.)
    return poly_model.predict(poly_features.transform(X_test))
# Get the predictions for the chosen degree (3 performed best)
y_pred_poly = polynomial_regression_model(3)
# Scatter of actual vs predicted power to eyeball the fit
plt.scatter(y_test,y_pred_poly)
# Evaluate with MAE / MSE / RMSE
MAE_poly = metrics.mean_absolute_error(y_test, y_pred_poly)
MSE_poly = metrics.mean_squared_error(y_test, y_pred_poly)
RMSE_poly = np.sqrt(metrics.mean_squared_error(y_test, y_pred_poly))
print('MAE:', MAE_poly)
print('MSE:', MSE_poly)
print('RMSE:', RMSE_poly)
Random Forest Regressor
# Importing and fitting a Random Forest regressor.
from sklearn.ensemble import RandomForestRegressor
# A fixed random_state makes the fitted forest (and the scores below, plus the
# final forecast predictions) reproducible across runs — consistent with the
# seeded train_test_split above.
RF = RandomForestRegressor(random_state=42)
RF.fit(X_train, y_train)
# Predict on the held-out test set.
y_pred_RF = RF.predict(X_test)
# Scatter of actual vs predicted power to eyeball the fit.
plt.scatter(y_test, y_pred_RF)
# Evaluate with MAE / MSE / RMSE.
MAE_RF = metrics.mean_absolute_error(y_test, y_pred_RF)
MSE_RF = metrics.mean_squared_error(y_test, y_pred_RF)
RMSE_RF = np.sqrt(MSE_RF)
print('MAE:', MAE_RF)
print('MSE:', MSE_RF)
print('RMSE:', RMSE_RF)
SVM Regressor
Tried predicting with the 'linear', 'poly' and 'rbf' kernels; among them 'rbf' gave the best result.
# Importing and fitting an SVR model; features are standardised in a pipeline
# because SVR is sensitive to feature scale.
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svmr_lr = make_pipeline(StandardScaler(),SVR(kernel='rbf'))
svmr_lr.fit(X_train,y_train)
# Predict on the held-out test set
y_pred_svmr = svmr_lr.predict(X_test)
# Scatter of actual vs predicted power to eyeball the fit
plt.scatter(y_test,y_pred_svmr)
# Evaluate with MAE / MSE / RMSE
MAE_svr = metrics.mean_absolute_error(y_test, y_pred_svmr)
MSE_svr = metrics.mean_squared_error(y_test, y_pred_svmr)
RMSE_svr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_svmr))
print('MAE:', MAE_svr)
print('MSE:', MSE_svr)
print('RMSE:', RMSE_svr)
# Collect every model's error metrics into one comparison table.
model_names = ['Linear Regression', 'Lasso', 'LR using Poly Features',
               'RandomForestRegressor', 'SVR']
metric_columns = {
    'Mean Absolute Error': [MAE_Regression, MAE_lasso, MAE_poly, MAE_RF, MAE_svr],
    'Mean Square Error': [MSE_Regression, MSE_lasso, MSE_poly, MSE_RF, MSE_svr],
    'Root Mean Square Error': [RMSE_Regression, RMSE_lasso, RMSE_poly, RMSE_RF, RMSE_svr],
}
model_score = pd.DataFrame(metric_columns, index=model_names)
model_score
From the above dataframe we can see that the Random Forest Regressor gave the best result, so we will use it on the weather forecast data.
# Apply the same feature engineering to the forecast data
weather_forecast = feature_eng(weather_forecast)
# Predict power from the forecast using the best model (Random Forest).
# NOTE(review): assumes weather_forecast ends up with the same columns in the
# same order as the training features — verify before trusting predictions.
pred_power = pd.DataFrame(RF.predict(weather_forecast))
# Optionally export the predicted power to a spreadsheet
#pred_power.to_excel('pred_power.xlsx', index=True)
pred_power.hist(bins=20)